The purpose of this document is to generate differential expression data for the Klotho mice. Based on the analysis in 1.Klotho_Initial_Data_Visualization.Rmd, there isn’t much difference between the heterozygous and homozygous carriers of each variant. Thus, to get differential expression for downstream analysis, we will compare carriers to the WT mice.
Here we generate tables that will be used in a comparison to human differential expression. We will get differential expression values for each gene with the WT as the reference, so that a negative value means that the transcript has reduced abundance in the variant carrier relative to the WT.
rm(list = ls())
library(here)
args <- commandArgs(trailingOnly=T)
comparison.type <- args[1]
age.batch <- args[2]
if(is.na(comparison.type)){
#comparison.type = "Hom" #compares homozygous FC/FC and VS/VS to WT/WT
comparison.type = "Het" #compares homozygous WT/FC and WT/VS to WT/WT
#comparison.type = "Carrier" #compares all carriers of VS and FC alleles to WT/WT
}
if(comparison.type == "Hom"){
vs.allele <- "VS/VS"; fc.allele = "FC/FC"
}
if(comparison.type == "Het"){
vs.allele <- "WT/VS"; fc.allele = "WT/FC"
}
if(comparison.type == "Carrier"){
vs.allele <- "VS"; fc.allele = "FC"
}
if(is.na(age.batch)){
age.batch <- 12
}
#subgroup <- NULL
subgroup <- list("age_batch" = as.numeric(age.batch))
#subgroup <- list("age_batch" = 12)
#subgroup <- list("age_batch" = 4, "sex_ge" = "Female") #example with more than one filter
subgroup.results <- paste(sapply(1:length(subgroup),
function(x) paste(names(subgroup)[x], subgroup[[x]], sep = "_")),
collapse = "_")
results.dir <- here("Results", subgroup.results)
processed.data.dir <- file.path(results.dir, "processed_data")
general.processed.data.dir <- here("Results", "Processed_Data")
general.data.dir <- here("Data", "general")
The analysis compares WT/FC mice and WT/VS mice to WT/WT mice. The age of the mice analyzed is 12 months.
all.fun <- list.files(here("Code"), full.names = TRUE)
for(i in 1:length(all.fun)){source(all.fun[i])}
needed.libraries <- c("synapser", "pheatmap", "DESeq2", "DT", "vioplot", "RColorBrewer",
"gprofiler2", "cluster", "pathview", "clusterProfiler", "stringr", "igraph",
"fgsea")
load_libraries(needed.libraries)
This table uses processed data from 1.Klotho_Initial_Data_Visualization.Rmd
info <- read.table(file.path(processed.data.dir, "mouse_info.csv"), sep = ",", header = TRUE)
raw.expr <- read.table(here("Data", "Mouse", "rsem.merged.gene_counts.tsv"), header = TRUE)
norm.expr <- readRDS(file.path(processed.data.dir, "Normalized_Expression.RDS"))
scaled.expr <- readRDS(file.path(processed.data.dir, "Scaled_Expression.RDS"))
Calculate differential expression for each genotype compared to WT.
gene_t_test <- function(x, y){
varx <- var(x)
vary <- var(y)
if(varx > 0 && vary > 0){
result <- t.test(x,y)
}else{
result <- NA
}
return(result)
}
geno_de <- function(variant = "VS", covar.table, expr.mat){
var.idx <- grep(variant, covar.table[,"genotype"])
#var.names <- covar.table[var.idx,"animalName"]
wt.idx <- which(covar.table[,"genotype"] == "WT/WT")
#wt.names <- covar.table[wt.idx,"animalName"]
#boxplot(list("WT" = expr.mat[2,wt.idx], "Var" = expr.mat[2,var.idx]))
sex.covar <- matrix(as.numeric(as.factor(covar.table[,"sex_ge"]))-1, ncol = 1)
adj.expr <- adjust(t(expr.mat), sex.covar)
test <- apply(adj.expr, 2, function(x) gene_t_test(x[wt.idx], x[var.idx]))
test.info <- t(sapply(test, function(x) if(length(x) > 1){c(x$estimate, x$conf.int, x$p.value)}else{rep(NA, 5)}))
colnames(test.info) <- c("Mean_WT", paste0("Mean_", variant),
"conf_int_wt", paste0("conv_int_", variant), "p")
return(test.info)
}
#This function treats each SNP in the haplotype independently
#In humans there are two haplotypes with two SNPs. Either FC
#or VS. In mice the haplotype is recombined, and they have
#the FS haplotype. This means we can count the number of F
#genotypes at the first position and the number of S genotypes
#at the second position. We will model the effects of the
#SNPs from the human protective haplotype (V and S). To do
#this, we create dummy variables for each SNP and code each
#position depending on the number of V and S alleles in each
#mouse.
dummy_geno <- function(genotypes){
split.geno <- strsplit(genotypes, "/")
numV <- numS <- rep(NA, length(genotypes))
for(i in 1:length(split.geno)){
split.geno[[i]][which(split.geno[[i]] == "WT")] <- "FS"
split.alleles <- unlist(strsplit(split.geno[[i]], "", fixed = TRUE))
numV[i] <- length(which(split.alleles == "V"))
numS[i] <- length(which(split.alleles == "S"))
}
result <- cbind(numV, numS)
colnames(result) <- c("V", "S")
return(result)
}
gene_by_var <- function(covar.table, expr.mat){
dummy.genotype <- dummy_geno(covar.table[,"genotype"])
sex.covar <- matrix(as.numeric(as.factor(covar.table[,"sex_ge"]))-1, ncol = 1)
adj.expr <- adjust(t(expr.mat), sex.covar)
test.df <- data.frame("expr" = adj.expr[,1], "V" = dummy.genotype[,"V"],
"S" = dummy.genotype[,"S"], "Int" = dummy.genotype[,"V"]*dummy.genotype[,"S"]/2)
test <- lm(expr~0+V+S+Int, data = test.df)
test <- apply(adj.expr, 2,
function(x) lm(x~-1+dummy.genotype[,"V"]*dummy.genotype[,"S"]))
test.coef <- t(sapply(test, function(x) x$coef[2:3]))
p <- t(sapply(test, function(x) coef(summary(x))[,"Pr(>|t|)"][2:3]))
f <- lapply(test, function(x) summary(x)$fstatistic)
total.p <- sapply(f, function(x) pf(x[1],x[2],x[3],lower.tail=F))
#qqunif.plot(p[,1])
test.info <- cbind(test.coef, p, total.p)
colnames(test.info) <- c("V_coef", "S_coef", "V_p", "S_p", "model_p")
return(test.info)
}
The QQ plots for the DE p values are shown below. Not too impressive.
par(mfrow = c(1,2))
vs.diff.expr <- geno_de(vs.allele, info, scaled.expr) #can be VS, WT/VS, or VS/VS
qqunif.plot(vs.diff.expr[,"p"], plot.label = "WT vs VS")
#boxplot(vs.diff.expr[,1:2])
write.table(vs.diff.expr, file.path(processed.data.dir,
paste0(gsub("/", "_", vs.allele), "_DEG.csv")), sep = ",", quote = FALSE)
fc.diff.expr <- geno_de(fc.allele, info, scaled.expr) #can be FC, WT/FC, or FC/FC
qqunif.plot(fc.diff.expr[,"p"], plot.label = "WT vs. FC")
#boxplot(fc.diff.expr[,1:2])
write.table(fc.diff.expr, file.path(processed.data.dir,
paste0(gsub("/", "_", fc.allele), "_DEG.csv")), sep = ",", quote = FALSE)
#We tried looking at each allele separately by testing the effects of
#the number of V alleles and the number of S alleles in each mouse.
#but we can't get the interaction between them.
allele.test <- gene_by_var(info, norm.expr)
par(mfrow = c(1,3))
qqunif.plot(allele.test[,"V_p"], plot.label = "V allele")
qqunif.plot(allele.test[,"S_p"], plot.label = "S allele")
qqunif.plot(allele.test[,"model_p"], plot.label = "Overall Model")
Below are the volcano plots for each variant.
par(mfrow = c(1,2))
vs.de <- vs.diff.expr[,2] - vs.diff.expr[,1]
plot(vs.de, -log10(vs.diff.expr[,"p"]), main = paste(vs.allele, "vs. WT"),
ylab = "-log10(pval)", xlab = "Effect Size")
fc.de <- fc.diff.expr[,2] - fc.diff.expr[,1]
plot(fc.de, -log10(fc.diff.expr[,"p"]), main = paste(fc.allele, "vs. WT"),
ylab = "-log10(pval)", xlab = "Effect Size")
The plots below show the VS-FC comparison
par(mfrow = c(1,2))
plot(-log10(vs.diff.expr[,"p"]), -log10(fc.diff.expr[,"p"]),
xlab = paste(vs.allele, "-log10(p)"), ylab = paste(fc.allele, "-log10(p)"))
plot.with.model(vs.de, fc.de, xlab = paste(vs.allele, "vs. WT"),
ylab = paste(fc.allele, "vs. WT"), report = "cor.test")
If we look at genes that are “significantly” differentially expressed in both VS and DE, we see a different picture.
alpha <- 0.05
sig.idx <- intersect(which(vs.diff.expr[,"p"] <= alpha), which(fc.diff.expr[,"p"] <= alpha))
plot(vs.de[sig.idx], fc.de[sig.idx], xlab = paste(vs.allele, "vs. WT"),
ylab = paste(fc.allele, "vs. WT"))
abline(h = 0, v = 0)
sig.vs <- vs.diff.expr[sig.idx,]
sig.fc <- fc.diff.expr[sig.idx,]
get_quadrants <- function(x, y, x.cutoff = 0, y.cutoff = 0){
high.x.high.y <- intersect(which(x > x.cutoff), which(y > y.cutoff))
high.x.low.y <- intersect(which(x > x.cutoff), which(y < y.cutoff))
low.x.high.y <- intersect(which(x < x.cutoff), which(y > y.cutoff))
low.x.low.y <- intersect(which(x < x.cutoff), which(y < y.cutoff))
result <- list("xhigh_yhigh" = high.x.high.y, "xhigh_ylow" = high.x.low.y,
"xlow_yhigh" = low.x.high.y, "xlow_ylow" = low.x.low.y)
return(result)
}
quadrant.idx <- get_quadrants(vs.de[sig.idx], fc.de[sig.idx])
names(quadrant.idx) <- gsub("y", paste0(vs.allele, "_"), gsub("x", paste0(vs.allele, "_"), names(quadrant.idx)))
quadrant.enrich <- lapply(quadrant.idx,
function(x) gost(rownames(sig.vs)[x], organism = "mmusculus",
source = c("GO", "KEGG", "REACTOME", "HP", "CORUM")))
## No results to show
## Please make sure that the organism is correct or set significant = FALSE
#terms <- plot.enrichment.group(quadrant.enrich, return.empty.columns = TRUE, max.term.size = 3000)
terms <- plot.enrichment.group(quadrant.enrich, plot.results = FALSE,
return.empty.columns = TRUE, max.term.size = 3000)
low_high_to_num <- function(vals, low.val = -0.5, high.val = 0.5){
numV <- rep(NA, length(vals))
numV[which(vals == "high")] <- high.val
numV[which(vals == "low")] <- low.val
return(numV)
}
split.names <- strsplit(colnames(terms), "_")
x.pos <- sapply(split.names, function(x) x[2])
y.pos <- sapply(split.names, function(x) x[4])
x.coord <- low_high_to_num(x.pos)
y.coord <- low_high_to_num(y.pos)
plot(0,1, xlim = c(-1, 1), ylim = c(-1, 1), type = "n",
xlab = paste(vs.allele, "vs. WT"), ylab = paste(fc.allele, "vs. WT"),
main = "Enrichments by quadrant")
abline(h = 0, v = 0)
for(i in 1:length(x.pos)){
sig.idx <- which(terms[,i] > 0)
if(length(sig.idx) == 0){
enrich.words = "No Enrichment"
}else{
enrich.words <- words_to_paragraph(names(sig.idx), line.len = 2)
}
plot.text(enrich.words, x = x.coord[i], y = y.coord[i], add = TRUE, cex = 0.7)
}
Instead of using a significance cutoff, we can use GSEA to look at enrichments for the genes that have positive and negative coefficients relative to wild type mice.
We can use several lists for comparison: Biodomains, KEGG pathways, and the intersection between biodomains and KEGG pathways. These were generated by 1.Klotho_Initial_Data_Visualiztion.Rmd and stored in Results/Processed_Data
bd.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Biodomains_for_GSEA.RDS"))
bd.go.list <- readRDS(file.path(general.processed.data.dir, "Mouse_Biodomains_sub_GO_for_GSEA.RDS"))
kegg.list <- readRDS(file.path(general.processed.data.dir, "Mouse_KEGG_for_GSEA.RDS"))
intersect.list <- readRDS(file.path(general.processed.data.dir, "Mouse_KEGG_Intersections_for_GSEA.RDS"))
run_gsea <- function(vals, pos.neg = "pos", gsea.list){
if(pos.neg == "pos"){
idx <- which(vals > 0)
idx.order <- order(vals[idx], decreasing = TRUE)
sorted.vals <- vals[idx[idx.order]]
#plot(sorted.vals)
}
if(pos.neg == "neg"){
idx <- which(vals < 0)
idx.order <- order(abs(vals[idx]), decreasing = TRUE)
sorted.vals <- abs(vals[idx[idx.order]])
#plot(sorted.vals)
}
gsea.enrich <- fgsea::fgsea(gsea.list, sorted.vals, scoreType = "pos")
norm.es <- as.numeric(as.matrix(gsea.enrich[,"NES"]))
pval <- as.numeric(as.matrix(gsea.enrich[,"padj"]))
names(norm.es) <- names(pval) <- gsea.enrich$pathway
result <- cbind(norm.es, pval)
return(result)
}
var.de.list <- list("VS" = vs.de, "FC" = fc.de)
enrich.types <- c("pos", "neg")
combo.names <- paste(rep(names(var.de.list), each = length(enrich.types)), rep(enrich.types, length(var.de.list)), sep = "_")
bd.gsea <- vector(mode = "list", length = length(combo.names))
names(bd.gsea) <- combo.names
idx <- 1
for(v in 1:length(var.de.list)){
for(tp in 1:length(enrich.types)){
bd.gsea[[idx]] <- run_gsea(var.de.list[[v]], enrich.types[tp], bd.list)
idx <- idx + 1
}
}
par(mar = c(4,12,2,2), mfrow = c(2,2))
for(i in 1:length(bd.gsea)){
barplot(sort(bd.gsea[[i]][,1]), las = 2, horiz = TRUE, main = combo.names[i])
}
The heat map below shows the normalized enrichment score for each direction of each variant. The strongest enrichments were upregulated by the FC (common) variant and were tau homeostasis and synapse.
plot_nes <- function(nes.mat, color.scale = "blue", col.text.rotation = 0,
col.text.adj = 0.5, col.text.shift = 0.05, row.text.shift = 0.18,
mat.mar = c(4, 14, 2, 0), bar.mar = c(24, 3,4,4)){
layout(matrix(c(1,2), nrow = 1), widths = c(1, 0.3))
row.order <- hclust(dist(nes.mat))$order
par(mar = mat.mar)
mat.col <- imageWithText(nes.mat[row.order,], col.scale = color.scale, light.dark = "f",
col.text.rotation = col.text.rotation, col.text.adj = col.text.adj,
col.text.shift = col.text.shift, row.text.shift = row.text.shift)
par(mar = bar.mar)
bar.col = imageWithTextColorbar(nes.mat, col.scale = color.scale, light.dark = "f",
cex = 0.7, bar.lwd = 4, round.nearest = 0.1)
}
enrich.bd <- sapply(bd.gsea, function(x) x[,1])
plot_nes(enrich.bd, col.text.rotation = 15)
top.n <- 10
kegg.gsea <- vector(mode = "list", length = length(combo.names))
names(kegg.gsea) <- combo.names
idx <- 1
for(v in 1:length(var.de.list)){
for(tp in 1:length(enrich.types)){
kegg.gsea[[idx]] <- run_gsea(var.de.list[[v]], enrich.types[tp], kegg.list)
idx <- idx + 1
}
}
#par(mar = c(4,12,2,2), mfrow = c(2,2))
#for(i in 1:length(bd.gsea)){
# barplot(sort(kegg.gsea[[i]][,1], decreasing = TRUE)[1:top.n], las = 2, horiz = TRUE, main = combo.names[i])
#}
common.names <- Reduce("intersect", lapply(kegg.gsea, rownames))
enrich.kegg <- matrix(NA, nrow = length(common.names), ncol = 4)
rownames(enrich.kegg) <- common.names
colnames(enrich.kegg) <- c(paste(vs.allele, "up"), paste(vs.allele, "down"), paste(fc.allele, "up"), paste(fc.allele, "down"))
for(i in 1:length(kegg.gsea)){
enrich.kegg[common.names,i] <- kegg.gsea[[i]][common.names, 1]
}
#filter to rows that have at least one value above a threshold
high.idx <- which(apply(enrich.kegg, 1, max) > 1.5)
#pdf("~/Desktop/gsea_kegg.pdf", width = 7, height = 12)
plot_nes(enrich.kegg[high.idx,])
#dev.off()
#pairs(enrich.kegg)
intersect.gsea <- vector(mode = "list", length = length(combo.names))
names(intersect.gsea) <- combo.names
idx <- 1
for(v in 1:length(var.de.list)){
for(tp in 1:length(enrich.types)){
test <- lapply(intersect.list, function(x) run_gsea(var.de.list[[v]], enrich.types[tp], x))
nes <- sapply(test, function(x) x[,"norm.es"])
intersect.gsea[[idx]] <- nes
idx <- idx + 1
}
}
par(mar = c(4,12,2,2), mfrow = c(2,2))
for(i in 1:length(intersect.gsea)){
has.vals <- which(sapply(intersect.gsea[[i]], length) > 0)
score.order <- order(sapply(intersect.gsea[[i]][has.vals], median), decreasing = FALSE)
vioplot(intersect.gsea[[i]][has.vals[score.order]], las = 2, main = combo.names[i], horizontal = TRUE, col = "lightgray")
stripchart(intersect.gsea[[i]][has.vals[score.order]], las = 2, main = combo.names[i], method = "jitter",
pch = 16, col = "darkgray", add = TRUE)
}
unlisted.gsea <- intersect.gsea
for(i in 1:length(unlisted.gsea)){
unlisted.gsea[[i]] <- unlist(intersect.gsea[[i]], recursive = FALSE)
}
common.names <- Reduce("intersect", lapply(unlisted.gsea, names))
enrich.intersect <- matrix(NA, nrow = length(common.names), ncol = 4)
rownames(enrich.intersect) <- common.names
colnames(enrich.intersect) <- c(paste(vs.allele, "up"), paste(vs.allele, "down"), paste(fc.allele, "up"), paste(fc.allele, "down"))
for(i in 1:length(intersect.gsea)){
enrich.intersect[common.names,i] <- unlisted.gsea[[i]][common.names]
}
#filter to rows that have at least one value above a threshold
high.idx <- which(apply(enrich.intersect, 1, max) > 1.9)
#length(high.idx)
#png("~/Desktop/gsea_intersect.png", width = 7, height = 22, units = "in", res = 300)
plot_nes(enrich.intersect[high.idx,], mat.mar = c(4, 22, 2, 0), col.text.rotation = 90,
bar.mar = c(15, 3, 5, 4), col.text.shift = 0.01, col.text.adj = 1)
#dev.off()
The top terms from each group are shown below.
#pdf("~/Desktop/top_enrich.pdf", width = 9, height = 5)
for(i in 1:ncol(enrich.intersect)){
top.terms <- sort(enrich.intersect[,i], decreasing = TRUE)[1:10]
par(mar = c(4, 25, 2, 2))
barplot(rev(top.terms), horiz = TRUE, las = 2, main = colnames(enrich.intersect)[i])
}
#dev.off()